# _*_ coding:utf-8 _*_

__author__ = 'Jacob yan'
__date__ = '2018/7/11'

'''
    生成中文词云,同时打印出词频信息
'''

import io
from os import path
from scipy.misc import imread
import matplotlib.pyplot as plt
import jieba
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from collections import Counter


# Global stopword table (word -> 1), populated by importStopword() and
# consulted by processChinese() to drop noise words before cloud generation.
stopwords = {}


def importStopword(filename=''):
    """Load stopwords from a UTF-8 text file (one word per line) into the
    module-level ``stopwords`` dict (word -> 1).

    Fixes over the previous version:
    - blank lines no longer terminate reading (the old ``while line:`` loop
      stopped at the first empty line, silently truncating the file);
    - the file is closed even if an exception occurs (``with`` statement);
    - the redundant ``setdefault`` before the assignment is removed.

    :param filename: path to the stopword file.
    """
    global stopwords
    with io.open(filename, 'r', encoding='utf-8') as f:
        for raw_line in f:
            word = raw_line.rstrip()
            if word:  # skip blank/whitespace-only lines instead of stopping
                stopwords[word] = 1


def processChinese(text, auto_generate='Auto'):
    """Prepare Chinese text for word-cloud generation.

    :param text: the raw input text (unicode).
    :param auto_generate: 'Auto' to segment with jieba and filter stopwords,
        'Manual' to pass the (pre-segmented) text through unchanged.
    :returns: a space-joined string of tokens ('Auto') or ``text`` ('Manual').
    :raises ValueError: for an unrecognized mode (previously this silently
        returned ``None``, which made downstream failures hard to trace).
    """
    if auto_generate == 'Auto':
        # Segment with jieba, dropping stopwords and bare spaces in one pass
        # (previously two separate list comprehensions).
        words = [w for w in jieba.cut(text)
                 if w not in stopwords and w != u' ']
        return u' '.join(words)
    elif auto_generate == 'Manual':
        return text
    raise ValueError(
        "auto_generate must be 'Auto' or 'Manual', got %r" % (auto_generate,))


def colorGenerator(color):
    if color == "Background":
        image_colors = ImageColorGenerator(back_coloring)
        plt.imshow(wc.recolor(color_func=image_colors))
    elif color == "Random":
        plt.imshow(wc.recolor(colormap=plt.cm.Dark2))


if __name__ == "__main__":
    d = path.dirname(__file__)
    with io.open('content.txt', 'r', encoding='utf-8') as outf:
        text = outf.read()


    importStopword(filename='./stopwords.txt')
    # text = processChinese(text, 'Manual')  # 手动分词
    text = processChinese(text,'Auto')  # 自动分词


    back_coloring = imread(path.join(d, "./image_backgroud/alice_color.png"))
    wc = WordCloud(font_path='./font/simhei.ttf',  # 设置字体
                   background_color="white",  # 背景颜色
                   # background_color="black",  # 背景颜色
                   max_words=2000,  # 词云显示的最大词数
                   mask=back_coloring,  # 设置背景图片
                   # max_font_size=100, #字体最大值
                   random_state=42,
                   )


    # 获得词频数据
    word_freq = wc.process_text(text)
    word_freq_list = sorted(
        word_freq.items(), key=lambda item: item[1], reverse=True)
    for item in word_freq_list:
        print "{0:3} -- {1:10}".format(item[1], item[0].encode('utf-8'))


    # 生成词云
    wc.generate(text)

    plt.figure(figsize=(8, 8), facecolor='white')
    colorGenerator('Random')
    plt.axis("off")
    plt.show()

    wc.to_file(path.join(d, "./ouput/output.png"))
